#Importing Data Set
ss14pusa = read.csv(file.choose(),header=TRUE)  # read csv file
ss14pusb = read.csv(file.choose(),header=TRUE)

#Libraries
require(plyr)
require(dplyr)
require(rbokeh)
require(ggplot2)
#Importing Data Set in a faster way
library(data.table)
variables=c("SEX","OCCP","WKHP","MAR","WAGP","SCHL","ESP","RAC1P","ST","PAOC","PWGTP")
ss14pusa=fread("~/Google Drive/Data for R/csv_pus/ss14pusa.csv",head=TRUE,select=variables)
ss14pusb=fread("~/Google Drive/Data for R/csv_pus/ss14pusb.csv",head=TRUE,select=variables)
attach(ss14pusa)
names(ss14pusa)
<<<<<<< HEAD
ss14pusa_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP, AGEP)
detach(ss14pusa)

attach(ss14pusb)
ss14pusb_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP, AGEP)
=======
ss14pusa_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
detach(ss14pusa)

attach(ss14pusb)
ss14pusb_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
>>>>>>> master
detach(ss14pusb)

Data = rbind(ss14pusa_edit,ss14pusb_edit)
colnames(Data) <- c("Gender","Occupation", "Work_hours", "Marriage", "Income", "Education" ,"Parental_Occupation", "Race", "State", "Children", "Weight", "Age")


write.csv(Data, file = "Data.csv",row.names=TRUE)
#Delete income=0/NA rows
row_to_keep=which(Data$Income>0)
Data=Data[row_to_keep,]
#Recode Region
#https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
<<<<<<< HEAD
Data$Region <- recode(Data$State,c(09,23,25,33,44,59,34,36,42)='Northeast';c(17,18,26,38,55,19,20,27,29,31,38,46)='Mid West'; c(10,12,13,24,37,45,51,11,54,01,21,28,47,05,22,40,48)='South';c(04,08,16,30,32,35,49,50,02,06,15,41,53)='West'))
=======
require(car)
Data$Region <- recode(Data$State,"c(09,23,25,33,44,59,34,36,42)='Northeast';c(17,18,26,38,55,19,20,27,29,31,38,46,39)='Midwest'; c(10,12,13,24,37,45,51,11,54,01,21,28,47,05,22,40,48)='South';c(04,08,16,30,32,35,49,50,02,06,15,41,53,56)='West';'72'='Puerto Rico'")
# Recode Division
Data$Division <- recode(Data$State,"c(53,41,6,2,15)='Pacific';c(4,8,16,30,32,35,49,56)='Mountain'; c(19,20,27,29,31,38,46)='West North Central';c(17,18,26,39,55)='East North Central';c(5,22,40,48)='West South Central';c(1,21,28,47)='East South Central';c(34,36,42)='Middle Atlantic';c(10,11,12,13,24,37,45,51,54)='South Atlantic';c(9,23,25,33,44,50)='New England'")

master

#Recode Education
Data$Education <- recode(Data$Education,"c(01,02,03,04,05,06,07,08,09,10,11)='~greade8';c(12,13,14,15,16,17,18,19)='grade9~college_nodegree';c(20,21)='associate/bachelor';c(22,23)='master/professional';c(24)='doctor')")

#Recode Marriage
Data$Marriage <- as.factor(Data$Marriage)
levels(Data$Marriage) <- c("Married", "Widowed", "Divorced", "Seperated", "Never Married")
#Recode Education
library(car)
Data$Education <- recode(Data$Education,"1:11='~greade8';12:19='grade9~college_nodegree';20:21='associate/bachelor';22:23='master/professional';24='doctor'")

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_SCI))

Managers

Women_MGR<-filter(Data_women, Occupation ==“MGR”) Data_wed_MGR <- ddply(Women_MGR, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_MGR,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) + geom_point() + geom_smooth(method=“loess”, fill=NA)

mgr <- figure(width = NULL, height = NULL) %>% ly_points(Age, MEAN, data = Data_wed_MGR, color = Marriage, glyph = as.factor(Marriage), hover = list(Marriage, Age)) mgr

Lawyers

Women_LGL<-filter(Data_women, Occupation ==“LGL”) Data_wed_LGL <- ddply(Women_LGL, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_LGL,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) + geom_point() + geom_smooth(method=“loess”, fill=NA)

lgl <- figure(width = NULL, height = NULL) %>% ly_points(Age, MEAN, data = Data_wed_LGL, color = Marriage, glyph = as.factor(Marriage), hover = list(Marriage, Age)) lgl

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_LGL))

Business

Women_BUS<-filter(Data_women, Occupation ==“BUS”) Data_wed_BUS <- ddply(Women_BUS, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_BUS,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) + geom_point() + geom_smooth(method=“loess”, fill=NA)

bus <- figure(width = NULL, height = NULL) %>% ly_points(Age, MEAN, data = Data_wed_BUS, color = Marriage, hover = list(Marriage, Age)) bus

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_BUS))

Computer

Women_CMM<-filter(Data_women, Occupation ==“CMM”) Data_wed_CMM <- ddply(Women_CMM, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_CMM,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) + geom_point() + geom_smooth(method=“loess”, fill=NA)

cmm <- figure(width = NULL, height = NULL) %>% ly_points(Age, MEAN, data = Data_wed_CMM, color = Marriage, hover = list(Marriage, Age))

cmm

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_CMM))

bounds <- range(c(Data_wed_occp\(Marriage, Data_wed_occp\)MEAN ))

grid_plot(lapply(split(Data_wed_occp, as.factor(Data_wed_occp$Occupation)), function(d) { figure(width = 300, height = 350) %>% ly_points(Age, MEAN, data = Data_wed_occp, color = Marriage, hover = list(Marriage, Age)) }), nrow = 2, same_axes = TRUE)

ggplot(Data_wed_occp,aes(x = factor(Occupation), fill = factor(Marriage), y = MEAN)) + geom_bar(stat = “identity”) + geom_text(position = “stack”,aes(y = MEAN, label = Marriage), size = 2, check_overlap = TRUE)

require(plotly) p <- plot_ly(Data_wed_occp, x = Age, y = MEAN,group = Occupation, text = paste(“Clarity:”, Age), mode = “markers”, xaxis = paste0(“x”, Occupation), symbol = Marriage, jitter = .9)

p<- subplot(p) ``` ???

master

MotherWorking=select(Data_women, Income,Children,Work_hours)
#detach(package:plyr)
GroupedMotherWorking <-
  MotherWorking %>%
  na.omit() %>%
  group_by(Children,Work_hours) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 

library(plotly)
# note how size is automatically scaled and added as hover text

plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome,size=sqrt(count), color = Children,text = paste("Count: ", count),opacity=1-Children*0.2,mode = "markers")

#plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome, text = paste("Count: ", #count),mode = "markers", opacity=1-Children*0.2,color = Children)

Observations: 1. When work_hours smaller than 60Hrs, Avg Income tend to be positively related to work hour. 2. There are generally more females with no Children 3. Females with Children from 6 to 17 often has higher avg income

ggplotly(p)

ggplotly(p)
Error in file(file, ifelse(append, "a", "w")) : 
  cannot open the connection

Observations: 1. Females with Children from 6 to 17 usually tend to have higher average income as the dot has darker color. 2. Occupation of ENG, MGR and CMM usually tend to have higher salary. On the other hand, female in LGL field always have lower salary 3.

From the graphs we see that the Northeast region, especially the New England Division, has higher income level than other parts of the States. The next question is why it happens, e.g. particular popular high-income occupations in certain regions?

---
title: "Women in Work Force"
output: html_notebook
---

```{r} 
#Importing Data Set
ss14pusa = read.csv(file.choose(),header=TRUE)  # read csv file
ss14pusb = read.csv(file.choose(),header=TRUE)

#Libraries
require(plyr)
require(dplyr)
require(rbokeh)
require(ggplot2)
```

```{r}
#Importing Data Set in a faster way
library(data.table)
variables=c("SEX","OCCP","WKHP","MAR","WAGP","SCHL","ESP","RAC1P","ST","PAOC","PWGTP")
ss14pusa=fread("~/Google Drive/Data for R/csv_pus/ss14pusa.csv",head=TRUE,select=variables)
ss14pusb=fread("~/Google Drive/Data for R/csv_pus/ss14pusb.csv",head=TRUE,select=variables)

```

```{r}
attach(ss14pusa)
names(ss14pusa)
<<<<<<< HEAD
ss14pusa_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP, AGEP)
detach(ss14pusa)

attach(ss14pusb)
ss14pusb_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP, AGEP)
=======
ss14pusa_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
detach(ss14pusa)

attach(ss14pusb)
ss14pusb_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
>>>>>>> master
detach(ss14pusb)

Data = rbind(ss14pusa_edit,ss14pusb_edit)
colnames(Data) <- c("Gender","Occupation", "Work_hours", "Marriage", "Income", "Education" ,"Parental_Occupation", "Race", "State", "Children", "Weight", "Age")


write.csv(Data, file = "Data.csv",row.names=TRUE)
```

```{r}
#Delete income=0/NA rows
row_to_keep=which(Data$Income>0)
Data=Data[row_to_keep,]
```

```{r, echo = FALSE}
#recode OCCP
class(Data$Occupation) <- "numeric"
Data$Occupation <- ifelse(Data$Occupation >= 10 & Data$Occupation <= 430, 1, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 1005 & Data$Occupation <= 1240, 2, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 800 & Data$Occupation <= 950, 3, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 2105 & Data$Occupation <= 2160, 4, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 3000 & Data$Occupation <= 3540, 5, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 510 & Data$Occupation <= 740, 6, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 1300 & Data$Occupation <= 1560, 7, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 1600 & Data$Occupation <= 1965, 8, Data$Occupation)

Data <- filter(Data, Occupation %in% c(1:8))
Data$Occupation <- as.factor(Data$Occupation)

levels(Data$Occupation) <- c('MGR', 'CMM', 'FIN', 'LGL', 'MED' , 'BUS', 'ENG', 'SCI')
```

```{r}
#Recode Region
#https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
<<<<<<< HEAD
Data$Region <- recode(Data$State,c(09,23,25,33,44,59,34,36,42)='Northeast';c(17,18,26,38,55,19,20,27,29,31,38,46)='Mid West'; c(10,12,13,24,37,45,51,11,54,01,21,28,47,05,22,40,48)='South';c(04,08,16,30,32,35,49,50,02,06,15,41,53)='West'))
=======
require(car)
Data$Region <- recode(Data$State,"c(09,23,25,33,44,59,34,36,42)='Northeast';c(17,18,26,38,55,19,20,27,29,31,38,46,39)='Midwest'; c(10,12,13,24,37,45,51,11,54,01,21,28,47,05,22,40,48)='South';c(04,08,16,30,32,35,49,50,02,06,15,41,53,56)='West';'72'='Puerto Rico'")
```
```{r}
# Recode Division
Data$Division <- recode(Data$State,"c(53,41,6,2,15)='Pacific';c(4,8,16,30,32,35,49,56)='Mountain'; c(19,20,27,29,31,38,46)='West North Central';c(17,18,26,39,55)='East North Central';c(5,22,40,48)='West South Central';c(1,21,28,47)='East South Central';c(34,36,42)='Middle Atlantic';c(10,11,12,13,24,37,45,51,54)='South Atlantic';c(9,23,25,33,44,50)='New England'")
```
>>>>>>> master

```{r}
#Recode Education
Data$Education <- recode(Data$Education,"c(01,02,03,04,05,06,07,08,09,10,11)='~greade8';c(12,13,14,15,16,17,18,19)='grade9~college_nodegree';c(20,21)='associate/bachelor';c(22,23)='master/professional';c(24)='doctor')")

#Recode Marriage
Data$Marriage <- as.factor(Data$Marriage)
levels(Data$Marriage) <- c("Married", "Widowed", "Divorced", "Seperated", "Never Married")
```

```{r}
#Recode Education
library(car)
Data$Education <- recode(Data$Education,"1:11='~greade8';12:19='grade9~college_nodegree';20:21='associate/bachelor';22:23='master/professional';24='doctor'")
```

```{r}
# On Survey Weights
# http://tophcito.blogspot.co.at/2014/04/social-science-goes-r-weighted-survey.html We assign everyone that did make it into the sample a weight. So people that turn out too often in the sample receive a weight of less than 1. And those that we were not able to reach enough of are upweighted with a weight larger than 1. Respondents that belong to groups that have been sampled perfectly receive a weight of 1. This solution is called post-stratification, because it computes weights based on group (or stratum) characteristics, like the distribution of age or gender proportions.

require(survey)
Data.w = svydesign(ids = ~1, data = Data, weights = Data$Weight)
summary(Data.w)

# comparison of the sex ratios in the unweighted and the weighted data frames:
prop.table(table(Data$Gender))
prop.table(svytable(~Gender, design = Data.w))

# Run this to see how it works
table(Data$Gender)
svytable(~Gender, design = Data.w)

# Take my interested variable 'State' as an example, the difference between the unweighted and weighted ratio is really small
prop.table(table(Data$State)) - prop.table(svytable(~State, design = Data.w))

# Ultimately, the difference for each income level between unweighted and weighted is small too
prop.table(table(Data$Income)) - prop.table(svytable(~Income, design = Data.w))

```

```{r, echo = FALSE}
#add in weights...example code from previous project:
require(survey)

df1<-svrepdesign(variables=pus_new[,1:13], 
repweights=pus_new[,14:93], type="BRR",combined.weights=TRUE,
weights=pus_new$PWGTP)
summary(df1)
svymean(~ WAGP,df1, na.rm = T)

#If We only want to know the SE and Mean of Statistics Income?

pus_new$FOD1P[pus_new$FOD1P == 3702 | pus_new$FOD1P == 6212 | pus_new$FOD1P == 6202]<-"Statistics"
df<-subset(pus_new, pus_new$FOD1P=="Statistics")
df2<-svrepdesign(variables=df[,1:13], 
repweights=df[,14:93], type="BRR",combined.weights=TRUE,
weights=df$PWGTP)
summary(df2)
svymean(~ WAGP,df2, na.rm = T)
```

```{r, echo = FALSE}

#median wage for each occupation by gender (just to acknowledge that there *is* a gender gap)
<<<<<<< Updated upstream
=======
<<<<<<< HEAD
require(ggplot2)
=======
>>>>>>> Stashed changes
#factor marriage and change labels


>>>>>>> origin/master
Data_sex_occp <- ddply(Data, .(Gender, Occupation), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))
ggplot(Data_sex_occp, aes(x=Occupation, y=MEAN, fill=factor(Gender))) + 
  geom_bar(stat="identity",position="dodge") + 
  scale_fill_brewer(palette="RdYlGn") +
  labs(fill="") + 
  ylab("Mean Salary ($)") + 
  xlab("Occupations") + 
  ggtitle(paste("Salary Comparison between Men & Women")) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1), 
        panel.background = element_rect(fill = 'white' )) + 
  theme_grey(base_size = 12)

```

```{r, echo = FALSE}
#subset on women, explore groups of women
Data_women <- filter(Data, Gender == 2)
Data_women <- filter(Data_women, Age > 20 & Age < 66)

#Race - Catherine
#Motherhood - 
#Marital Status - Erica (potentially add in region)
#Proportion of graduate degrees in each occupation?
#Region - Vivien

```

```{r, echo = FALSE}
<<<<<<< Updated upstream
=======
<<<<<<< HEAD
=======
>>>>>>> Stashed changes
#marital status, subset at specific age to control for experience
Women_40 <- subset(Data_women, Data_women$Age == 40)
Data_women$Agegroup <- cut(as.numeric(Data_women$Age), breaks = c(20,25,30,35,40,45,50,55,60,65))

Data_wed_occp <- ddply(Data_women, .(Marriage, Occupation, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

plot(Data_wed_occp$Age, Data_wed_occp$MEAN)

summary(lm(MEAN ~ Marriage + Age + I(Age^2)) , data = Data_wed_occp))

ggplot(Data_wed_occp,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA) + facet_wrap(~Occupation, scales= "free")

#Finance
Women_FIN<-filter(Data_women, Occupation =="FIN")
Data_wed_FIN <- ddply(Women_FIN, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_FIN,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_FIN,
    color = Marriage,
    hover = list(Marriage, Age))

summary(lm(MEAN ~ Marriage + Age + I(Age^2), data = Data_wed_FIN))

Women_ENG<-filter(Data_women, Occupation =="ENG")
Data_wed_ENG <- ddply(Women_ENG, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_ENG,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

eng <-figure(width = NULL, height = NULL) %>%
  ly_lines(Agegroup, MEAN, data = Data_wed_ENG,
    color = Marriage, glyph = as.factor(Age),
    hover = list(Marriage, Age))

eng

lm1 <- lm(MEAN ~ Marriage + Age + I(Age^2), data = Data_wed_ENG)
summary(lm1)

#medical field

Women_MED<-filter(Data_women, Occupation =="MED")
Data_wed_MED <- ddply(Women_MED, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_MED,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

med <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_MED,
    color = Marriage, glyph = as.factor(Marriage),
    hover = list(Marriage, Age))
med

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_MED))

#scientist

Women_SCI<-filter(Data_women, Occupation =="SCI")
Data_wed_SCI <- ddply(Women_SCI, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_SCI,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

sci <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_SCI,
    color = Marriage, glyph = as.factor(Marriage),
    hover = list(Marriage, Age))
sci
<<<<<<< HEAD

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_SCI))

#Managers
Women_MGR<-filter(Data_women, Occupation =="MGR")
Data_wed_MGR <- ddply(Women_MGR, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_MGR,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

mgr <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_MGR,
    color = Marriage, glyph = as.factor(Marriage),
    hover = list(Marriage, Age))
mgr

#Lawyers

Women_LGL<-filter(Data_women, Occupation =="LGL")
Data_wed_LGL <- ddply(Women_LGL, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_LGL,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

lgl <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_LGL,
    color = Marriage, glyph = as.factor(Marriage),
    hover = list(Marriage, Age))
lgl

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_LGL))


#Business

Women_BUS<-filter(Data_women, Occupation =="BUS")
Data_wed_BUS <- ddply(Women_BUS, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_BUS,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

bus <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_BUS,
    color = Marriage,
    hover = list(Marriage, Age))
bus

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_BUS))
<<<<<<< Updated upstream

#Computer

=======

#Computer

>>>>>>> Stashed changes
Women_CMM<-filter(Data_women, Occupation =="CMM")
Data_wed_CMM <- ddply(Women_CMM, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_CMM,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

cmm <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_CMM,
    color = Marriage,
    hover = list(Marriage, Age))

cmm 

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_CMM))

bounds <- range(c(Data_wed_occp$Marriage, Data_wed_occp$MEAN ))

grid_plot(lapply(split(Data_wed_occp, as.factor(Data_wed_occp$Occupation)), function(d) {
  figure(width = 300, height = 350) %>%
    ly_points(Age, MEAN, data = Data_wed_occp,
    color = Marriage,
    hover = list(Marriage, Age))
}), nrow = 2, same_axes = TRUE)



require(plotly)
p <- plot_ly(Data_wed_occp, x = Age, y = MEAN,group = Occupation, text = paste("Clarity: ", Age),
        mode = "markers",  xaxis = paste0("x", Occupation), symbol = Marriage, jitter = .9)

p<- subplot(p)
```


summary(lm(MEAN ~ Marriage + Age, data = Data_wed_SCI))

#Managers
Women_MGR<-filter(Data_women, Occupation =="MGR")
Data_wed_MGR <- ddply(Women_MGR, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_MGR,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

mgr <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_MGR,
    color = Marriage, glyph = as.factor(Marriage),
    hover = list(Marriage, Age))
mgr

#Lawyers

Women_LGL<-filter(Data_women, Occupation =="LGL")
Data_wed_LGL <- ddply(Women_LGL, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_LGL,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

lgl <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_LGL,
    color = Marriage, glyph = as.factor(Marriage),
    hover = list(Marriage, Age))
lgl

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_LGL))


#Business

Women_BUS<-filter(Data_women, Occupation =="BUS")
Data_wed_BUS <- ddply(Women_BUS, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_BUS,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

bus <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_BUS,
    color = Marriage,
    hover = list(Marriage, Age))
bus

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_BUS))

#Computer

Women_CMM<-filter(Data_women, Occupation =="CMM")
Data_wed_CMM <- ddply(Women_CMM, .(Marriage, Age), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))

ggplot(Data_wed_CMM,aes(y = MEAN, x =Age,colour=Marriage,shape=Marriage)) +
geom_point() + geom_smooth(method="loess", fill=NA)

cmm <- figure(width = NULL, height = NULL) %>%
  ly_points(Age, MEAN, data = Data_wed_CMM,
    color = Marriage,
    hover = list(Marriage, Age))

cmm 

summary(lm(MEAN ~ Marriage + Age, data = Data_wed_CMM))

bounds <- range(c(Data_wed_occp$Marriage, Data_wed_occp$MEAN ))

grid_plot(lapply(split(Data_wed_occp, as.factor(Data_wed_occp$Occupation)), function(d) {
  figure(width = 300, height = 350) %>%
    ly_points(Age, MEAN, data = Data_wed_occp,
    color = Marriage,
    hover = list(Marriage, Age))
}), nrow = 2, same_axes = TRUE)



ggplot(Data_wed_occp,aes(x = factor(Occupation), fill = factor(Marriage), 
y = MEAN)) +
 geom_bar(stat = "identity")  +
   geom_text(position = "stack",aes(y = MEAN, label = Marriage), size = 2, check_overlap = TRUE)


require(plotly)
p <- plot_ly(Data_wed_occp, x = Age, y = MEAN,group = Occupation, text = paste("Clarity: ", Age),
        mode = "markers",  xaxis = paste0("x", Occupation), symbol = Marriage, jitter = .9)

p<- subplot(p)
``` ???

>>>>>>> master

```{r, echo = FALSE}
>>>>>>> origin/master
Data_women <- filter(Data, Gender == 2)
MotherHood=select(Data_women, Income,Children)
#detach(package:plyr)
GroupedMotherHood <-
  MotherHood %>%
  na.omit() %>%
  group_by(Children) %>% 
  summarize(
    AvgIncome = mean(Income)
  ) 


### Bar Chart
ggplot(GroupedMotherHood, aes(x=Children, y=AvgIncome)) + 
  geom_bar(colour="black", fill="#DD8888", width=.5, stat="identity") + 
  guides(fill=FALSE) +
  xlab("Children") + ylab("Mean Income") +
  coord_cartesian(ylim = c(50000,70000)) +
  #scale_y_continuous(limits = c(40000,70000),oob = rescale_none) +
  ggtitle("Income for Motherhood")



### 
```

```{r}
MotherWorking=select(Data_women, Income,Children,Work_hours)
#detach(package:plyr)
GroupedMotherWorking <-
  MotherWorking %>%
  na.omit() %>%
  group_by(Children,Work_hours) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 

library(plotly)
# note how size is automatically scaled and added as hover text

plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome,size=sqrt(count), color = Children,text = paste("Count: ", count),opacity=1-Children*0.2,mode = "markers")

#plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome, text = paste("Count: ", #count),mode = "markers", opacity=1-Children*0.2,color = Children)
```
Observations:
1. When work_hours smaller than 60Hrs, Avg Income tend to be positively related to work hour.
2. There are generally more females with no Children
3. Females with Children from 6 to 17 often has higher avg income
```{r}
MotherOccupation=select(Data_women, Income,Children,Occupation)
#detach(package:plyr)
GroupedMotherOccupation <-
  MotherOccupation %>%
  na.omit() %>%
  group_by(Children,Occupation) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 


revalue(GroupedMotherOccupation$Children,c('1'='Children under 6','2'='Children from 6 to 17','3'='Children under 6 and above 6','4'='No Children'))

ggplot(GroupedMotherOccupation,aes(x=Children))+geom_point(aes(y=Occupation,size=count,colour=AvgIncome))+scale_colour_gradient(low="green",high="red")+scale_x_continuous(breaks=1:4, labels=c("Children under 6", "Children from 6 to 17", "Children under 6 and above 6", "No Children")) 

#scale_colour_gradientn(colours = terrain.colors(10))
```
Observations:
1. Females with Children from 6 to 17 usually tend to have higher average income as the dot has darker color.
2. Occupation of ENG, MGR and CMM usually tend to have higher salary. On the other hand, female in LGL field always have lower salary
3. 


```{r,echo = FALSE, message=FALSE, warning=FALSE, results='hold'}

# Regional Effect
# wanna know how to use survey weights
# calculate median of women in each State and plot in mapview
# Investigate in Occupation or other vavriables to see why

DW=Data[Data$Gender==2, ]

# boxplot of each state/division/region to see distribution
require(plotly)
plot_ly(DW, x = Income, color = Region, type = "box")
plot_ly(DW, x = Income, color = Division, type = "box")
boxplot(DW$Income~DW$State) 

# calculate median of women in each Division/Region/State

require(dplyr)

DivMedian <- aggregate(Income~Division,Data_women, median)
RegMedian <- aggregate(Income~Region,Data_women, median)
StateMedian <- aggregate(Income~State,DW, median)

# Recode State Names
require(readr)

statenames <- read_csv("~/Google Drive/Columbia/5243 ADS/Project 1/data/statenames.csv")
names(StateMedian)[1]<-"code"
State.Median <- merge(StateMedian, statenames, by="code")
head(SM)

# plot state income median in mapview
require(googleVis)
map<-gvisGeoMap(data=State.Median, locationvar = "name", numvar="Income",options=list(region="US",dataMode="regions",width='800px',heigth='600px',gvis.editor = "Income Median of US States", colors="[0xEFEDF5, 0xA64D79]"))
plot(map)


```
From the graphs we see that the Northeast region, especially the New England Division, has higher income level than other parts of the States. The next question is why it happens, e.g. particular popular high-income occupations in certain regions?

